# Restrict the daily vaccination data to the Scotland-wide totals,
# keeping only rows where some vaccinations have been recorded.
trend_vacc_hb <- daily_vacc_hb %>%
  filter(
    hb_name == "Scotland",
    sex == "Total",
    age_group == "All vaccinations",
    cumulative_number_vaccinated != 0
  )
# Plot to visualise the daily vaccination trend, one line per dose.
plot_vaccine <- trend_vacc_hb %>%
  ggplot() +
  aes(x = date, y = number_vaccinated) +
  geom_line(aes(color = dose)) +
  scale_x_date(breaks = "1 month", date_labels = "%b - %y") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  ggtitle("People Vaccinated") +
  xlab("Year") +
  # BUG FIX: the y label previously read "No of Positive Cases",
  # copied from a case-count plot; this chart shows vaccinations.
  ylab("No of People Vaccinated") +
  color_theme() +
  scale_colour_manual(values = c("#f1a340", "#5ab4ac"))
# Render as an interactive plotly widget.
ggplotly(plot_vaccine)
#Plot to visualise cumulative vaccination trend.
plot_vaccine_cumm <- trend_vacc_hb %>%
  ggplot() +
  aes(x = date, y = cumulative_number_vaccinated) +
  geom_line(aes(color = dose)) +
  scale_x_date(breaks = "1 month", date_labels = "%b - %y") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  # FIX: corrected the misspelled chart title ("Cummulative").
  ggtitle("Cumulative Trend on Vaccination") +
  xlab("Year") +
  ylab("No of People Vaccinated") +
  color_theme() +
  scale_colour_manual(values = c("#f1a340", "#5ab4ac")) +
  # Display the y axis in millions for readability.
  scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))
ggplotly(plot_vaccine_cumm)
NA
Data Preparation
# Keep only the second-dose series and the two columns the model needs.
trend_vacc_hb <- trend_vacc_hb %>%
  filter(dose == "Dose 2") %>%
  select(date, cumulative_number_vaccinated)

# Build a date-indexed zoo object from the cumulative counts.
# NOTE(review): format '%m/%d/%Y' assumes US-style date strings —
# confirm against the source data (ignored if `date` is already Date).
daily_vacc_hb_zoo <- zoo(
  trend_vacc_hb$cumulative_number_vaccinated,
  order.by = as.Date(trend_vacc_hb$date, format = '%m/%d/%Y')
)

# Convert the zoo series into a timeSeries for the ARIMA workflow below.
daily_vacc_hb_timeseries <- timeSeries::as.timeSeries(daily_vacc_hb_zoo)
ARIMA MODEL
Step 1 : Visualize the time series
# Step 1: visualise the original (undifferenced) time series.
original_series <-
  # FIX: use the `ts.colour` argument name, matching the other autoplot
  # calls in this file; a bare `colour =` is not the documented way to
  # set the line colour for autoplot on a time series (ggfortify).
  autoplot(daily_vacc_hb_timeseries, ts.colour = '#5ab4ac') +
  xlab("Month") +
  ylab("VACCINATED") +
  #scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  ggtitle("Original Series") +
  scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) +
  color_theme()
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
# Render the series plot as an interactive plotly widget.
ggplotly(original_series)
Step 2 : Identification of model : (Finding d:)
To identify whether the time series is stationary or non-stationary, we can use the ADF (Augmented Dickey-Fuller) test.
# ADF test on the original series (null hypothesis: non-stationary).
adf_test <- adf.test(daily_vacc_hb_timeseries)
The time series is not stationary, since the p-value is high. So we apply differencing.
# First difference of the series, then re-test for stationarity.
first_diff_ts<- diff(daily_vacc_hb_timeseries)
# na.omit drops the leading NA that differencing introduces.
adf_test1 <- adf.test(na.omit(first_diff_ts))
# Second difference and its ADF test.
second_diff_ts<- diff(first_diff_ts)
adf_test2 <- adf.test(na.omit(second_diff_ts))
Warning in adf.test(na.omit(second_diff_ts)) :
p-value smaller than printed p-value
# Print the ADF result for the first-differenced series.
adf_test1
Augmented Dickey-Fuller Test
data: na.omit(first_diff_ts)
Dickey-Fuller = -0.75615, Lag order = 6, p-value = 0.9647
alternative hypothesis: stationary
# Print the ADF result for the second-differenced series.
adf_test2
Augmented Dickey-Fuller Test
data: na.omit(second_diff_ts)
Dickey-Fuller = -6.3311, Lag order = 6, p-value = 0.01
alternative hypothesis: stationary
Create a dataframe to compare
# Collect the three ADF results into a single comparison table.
adf_data <- data.frame(
  Data = c("Original", "First-Ordered", "Second Ordered"),
  Dickey_Fuller = c(
    adf_test$statistic,
    adf_test1$statistic,
    adf_test2$statistic
  ),
  p_value = c(
    adf_test$p.value,
    adf_test1$p.value,
    adf_test2$p.value
  )
)
adf_data
Initially the p-value is high which indicates that the Time Series is not stationary. So we apply difference 2 times. After the second difference, the p-value < significance level (0.05) So we can conclude that the difference data are stationary. So difference (d = 2)
# forecast::ndiffs estimates the differencing order d required
# for stationarity (printed below: 2).
ndiffs(daily_vacc_hb_timeseries)
[1] 2
Let’s plot the First Order and Second Order Difference Series
Order of first difference
# Plot the first-order differenced series.
first_order <- autoplot(first_diff_ts, ts.colour = '#5ab4ac') +
  labs(title = "First-Order Difference", x = "Month", y = "VACCINATED") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  color_theme()
# Render interactively.
ggplotly(first_order)
Order of Second difference
# Plot the second-order differenced series.
second_order <- autoplot(second_diff_ts, ts.colour = '#5ab4ac') +
  labs(title = "Second-Order Difference", x = "Month", y = "VACCINATED") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  color_theme()
# Render interactively.
ggplotly(second_order)
For our model ARIMA (p,d,q), we found d = 2, the next step is to get the values of p and q, the order of AR and MA part. Plot ACF and PACF charts to identify q and p respectively.
[1] 0.91 0.87 0.83 0.81 0.81 0.80 0.84 0.77 0.73 0.69 0.67 0.66 0.66 0.69 0.62 0.58 0.55 0.54 0.54 0.56 0.58 0.53 0.50
[24] 0.48 0.46 0.46 0.47
[1] -0.30 0.00 -0.11 -0.08 -0.03 -0.19 0.61 -0.18 -0.05 -0.06 -0.09 -0.04 -0.13 0.53 -0.17 -0.06 -0.12 -0.07 -0.08
[20] -0.05 0.47 -0.17 0.01 -0.09 -0.06 -0.07 -0.09
[1] 0.91 0.25 0.06 0.12 0.15 0.13 0.31 -0.47 -0.10 0.09 0.01 0.05 0.07 0.05 -0.25 -0.04 0.04 0.13 0.09
[20] 0.11 0.03 -0.13 0.03 -0.06 -0.06 -0.01 0.00
[1] -0.30 -0.10 -0.15 -0.18 -0.16 -0.36 0.48 0.12 -0.10 -0.02 -0.06 -0.08 -0.06 0.23 0.03 -0.05 -0.16 -0.12 -0.13
[20] -0.04 0.11 -0.05 0.06 0.07 0.02 0.01 -0.12
The ACF and PACF plots of the differenced data show the following patterns:
The ACF does not follow a sinusoidal pattern but shows a slow geometric decay. There is also a significant spike at lag 3 in the PACF, but none beyond lag 3, so the data may follow an AR(3) model.
The PACF is sinusoidal and decaying. Also there is a significant spike at lag 2 in the ACF, but none beyond lag 2 So the data may follow an MA(2) model
So we propose three ARMA models for the differenced data: ARMA(p,q) ARMA(3,2), ARMA(3,0) and ARMA(0,2).
That is, for the original time series, we propose three ARIMA(p,d,q) models: ARIMA(3,1,2), ARIMA(3,1,0) and ARIMA(0,1,2).
Manual Model:
# Fit the manually proposed ARIMA candidates.
arima_fit1 <- Arima(daily_vacc_hb_timeseries, order = c(3, 1, 2))
arima_fit2 <- Arima(daily_vacc_hb_timeseries, order = c(3, 1, 0))
# BUG FIX: arima_fit3 previously duplicated arima_fit1's order
# c(3,1,2); the analysis proposes ARIMA(0,1,2) (the MA(2) model)
# as the third candidate.
arima_fit3 <- Arima(daily_vacc_hb_timeseries, order = c(0, 1, 2))
# Additional candidate mixing AR(3) with a single MA term.
arima_fit4 <- Arima(daily_vacc_hb_timeseries, order = c(3, 1, 1))
# NOTE(review): ndiffs() reported d = 2 earlier, yet these fits use
# d = 1 — confirm whether order = c(p, 2, q) was intended.
summary(arima_fit1)
Series: daily_vacc_hb_timeseries
ARIMA(3,1,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
-0.4745 0.5445 0.8172 1.3849 0.9136
s.e. 0.0493 0.0411 0.0525 0.0410 0.0478
sigma^2 estimated as 19647590: log likelihood=-2786.96
AIC=5585.93 AICc=5586.23 BIC=5607.82
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 469.8414 4385.654 2519.935 0.5429195 1.997664 0.1854123 -0.154946
summary(arima_fit2)
Series: daily_vacc_hb_timeseries
ARIMA(3,1,0)
Coefficients:
ar1 ar2 ar3
0.6652 0.2214 0.0864
s.e. 0.0589 0.0697 0.0588
sigma^2 estimated as 20056182: log likelihood=-2790.42
AIC=5588.84 AICc=5588.98 BIC=5603.43
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 376.4332 4446.874 2562.049 0.5930404 1.989212 0.188511 -0.01736869
summary(arima_fit3)
Series: daily_vacc_hb_timeseries
ARIMA(3,1,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
-0.4745 0.5445 0.8172 1.3849 0.9136
s.e. 0.0493 0.0411 0.0525 0.0410 0.0478
sigma^2 estimated as 19647590: log likelihood=-2786.96
AIC=5585.93 AICc=5586.23 BIC=5607.82
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 469.8414 4385.654 2519.935 0.5429195 1.997664 0.1854123 -0.154946
summary(arima_fit4)
Series: daily_vacc_hb_timeseries
ARIMA(3,1,1)
Coefficients:
ar1 ar2 ar3 ma1
1.3482 -0.3143 -0.0387 -0.7437
s.e. 0.1062 0.1065 0.0735 0.0869
sigma^2 estimated as 19186522: log likelihood=-2783.63
AIC=5577.27 AICc=5577.48 BIC=5595.51
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 273.3644 4341.649 2503.716 0.6659324 1.916102 0.184219 0.002191562
Forecast the Manual ARIMA model
# Forecast the manual models: project each candidate 30 steps ahead.
future <- forecast(arima_fit1, h = 30)
future2 <- forecast(arima_fit2, h = 30)
future3 <- forecast(arima_fit3, h = 30)
future4 <- forecast(arima_fit4, h = 30)
# Plot the forecasted manual models in a 2x2 grid.
# FIX: save and restore the graphics state so the 2x2 layout does not
# leak into later base-graphics plots.
old_par <- par(mfrow = c(2, 2))
plot(future)
plot(future2)
plot(future3)
plot(future4)
par(old_par)
# Exhaustive automatic model search: disable the stepwise shortcut and
# CSS approximation so every candidate is compared on exact likelihood;
# trace = TRUE echoes each candidate's AICc as it is evaluated.
auto_arima_fit_vacc <- auto.arima(
  daily_vacc_hb_timeseries,
  seasonal = FALSE,
  stepwise = FALSE,
  approximation = FALSE,
  trace = TRUE
)
ARIMA(0,2,0) : 5592.195
ARIMA(0,2,1) : 5560.298
ARIMA(0,2,2) : 5556.104
ARIMA(0,2,3) : 5552.488
ARIMA(0,2,4) : 5552.594
ARIMA(0,2,5) : 5549.508
ARIMA(1,2,0) : 5568.45
ARIMA(1,2,1) : 5553.24
ARIMA(1,2,2) : 5555.145
ARIMA(1,2,3) : 5553.818
ARIMA(1,2,4) : 5550.978
ARIMA(2,2,0) : 5567.898
ARIMA(2,2,1) : 5555.002
ARIMA(2,2,2) : 5556.35
ARIMA(2,2,3) : 5540.291
ARIMA(3,2,0) : 5563.565
ARIMA(3,2,1) : 5552.305
ARIMA(3,2,2) : 5509.835
ARIMA(4,2,0) : 5556.091
ARIMA(4,2,1) : 5549.673
ARIMA(5,2,0) : 5551.089
Best model: ARIMA(3,2,2)
# Summarise the model selected by auto.arima.
summary(auto_arima_fit_vacc)
Series: daily_vacc_hb_timeseries
ARIMA(3,2,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
0.8243 -0.4487 -0.4144 -1.2766 0.9213
s.e. 0.0598 0.0704 0.0566 0.0358 0.0312
sigma^2 estimated as 16167581: log likelihood=-2748.77
AIC=5509.53 AICc=5509.84 BIC=5531.4
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 13.84351 3971.207 2264.727 0.4282413 1.784024 0.1666346 -0.05914206
Model Selection Criteria :
ARIMA models with minimum AIC, RMSE and MAPE criteria were chosen as the best models. Automated ARIMA confirms that the ARIMA(3, 2, 2) seems good based on AIC
# z-tests for the statistical significance of each ARIMA coefficient.
lmtest::coeftest(auto_arima_fit_vacc)
z test of coefficients:
Estimate Std. Error z value Pr(>|z|)
ar1 0.824305 0.059779 13.7892 < 2.2e-16 ***
ar2 -0.448740 0.070372 -6.3767 1.81e-10 ***
ar3 -0.414418 0.056614 -7.3200 2.48e-13 ***
ma1 -1.276611 0.035846 -35.6141 < 2.2e-16 ***
ma2 0.921307 0.031208 29.5218 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
All the coefficients are statistically significant.
Let’s plot the diagnostics with the results to make sure the normality and correlation assumptions for the model hold. If the residuals look like white noise, proceed with forecast and prediction, otherwise repeat the model building.
# Residual diagnostics (time plot, ACF, histogram) plus Ljung-Box test.
# NOTE(review): `theme` is forwarded via ... — confirm the installed
# forecast version accepts it.
res <- checkresiduals(auto_arima_fit_vacc, theme = color_theme())
Ljung-Box test
data: Residuals from ARIMA(3,2,2)
Q* = 105.98, df = 5, p-value < 2.2e-16
Model df: 5. Total lags used: 10
# Print the stored Ljung-Box test result.
res
Ljung-Box test
data: Residuals from ARIMA(3,2,2)
Q* = 105.98, df = 5, p-value < 2.2e-16
The ACF plot of the residuals from the ARIMA(3,2,2) model shows significant autocorrelations at regular intervals. The portmanteau (Ljung-Box) test returns a very small p-value (close to zero), indicating that the residuals are not white noise — some structure in the data remains uncaptured by the model.
The residual errors seem fine with near zero mean and uniform variance. Let’s plot the actuals against the fitted values
#Convert the model to dataframe for plotting
daily_vacc_hb_timeseries_data <- fortify(daily_vacc_hb_timeseries) %>%
  clean_names() %>%
  remove_rownames() %>%
  rename(date = index,
         vacc = data) %>%
  # FIX: seq(1:n) is an anti-pattern (it works only incidentally);
  # seq_len(n) is the idiomatic equivalent, yielding the same 1..n.
  mutate(index = seq_len(nrow(daily_vacc_hb_timeseries)))

# Fitted values = observed series minus the model residuals.
arima_fit_resid <- ts(daily_vacc_hb_timeseries) - resid(auto_arima_fit_vacc)
arima_fit_data <- fortify(arima_fit_resid) %>%
  clean_names() %>%
  mutate(data = round(data, 2))

# Join observed and fitted values on the shared row index.
fit_existing_data <- daily_vacc_hb_timeseries_data %>%
  inner_join(arima_fit_data, by = c("index"))
# Overlay the fitted ARIMA values (red) on the observed series (teal).
fit_existing_data %>%
  ggplot() +
  aes(x = date, y = vacc) +
  geom_line(color = "#5ab4ac") +
  geom_line(aes(x = date, y = data), colour = "red") +
  labs(
    title = "Fitting the ARIMA model with existing data",
    x = "Month",
    y = "Number of People vaccinated"
  ) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) +
  color_theme()
6 Forecast using the model
Data Preparation :
#Convert the model to dataframe for plotting
# 60-day forecast with 80% and 95% prediction intervals.
forecast_model <- forecast(auto_arima_fit_vacc,level = c(80, 95), h = 60)
forecast_model_data <- fortify(forecast_model) %>%
clean_names() %>%
mutate(data = round(data,2),
fitted= round(fitted,2))
# Forecast horizon: the 60 days following the last observed date.
forecast_start_date <- as.Date(max(daily_vacc_hb_timeseries_data$date)+1)
forecast_end_date <- as.Date(forecast_start_date+59)
# Future rows: those that carry a point forecast; attach calendar dates.
forecast_data <- forecast_model_data %>%
filter(!(is.na(point_forecast))) %>%
mutate(date = seq(forecast_start_date,forecast_end_date, by =1)) %>%
select(-data,-fitted, -index)
# Historical rows: observed data joined back to their original dates.
fitted_data <- forecast_model_data %>%
filter(!(is.na(data))) %>%
inner_join(daily_vacc_hb_timeseries_data, by = c("index")) %>%
mutate(date = as.Date(date)) %>%
select(date, data, fitted)
# Plotting the Vaccination series plus the forecast and 95% intervals:
# label positions marking the observed ("PAST") and forecast ("FUTURE")
# regions of the chart.
annotation <- data.frame(
  x = as.Date(c("03-04-2021", "31-10-2021"), "%d-%m-%Y"),
  y = c(1000000, 3000000),
  label = c("PAST", "FUTURE")
)
# Time series plot for the next 60 days from the best ARIMA model,
# with 80% (green) and 95% (grey) prediction-interval ribbons.
fitted_data %>%
  ggplot() +
  geom_line(aes(x = date, y = data)) +
  geom_line(aes(x = date, y = fitted), colour = "red") +
  geom_line(aes(x = date, y = point_forecast), data = forecast_data) +
  geom_ribbon(aes(x = date, y = point_forecast, ymin = lo_80, ymax = hi_80),
              data = forecast_data, alpha = 0.3, fill = "green") +
  geom_ribbon(aes(x = date, y = point_forecast, ymin = lo_95, ymax = hi_95),
              data = forecast_data, alpha = 0.1) +
  ggtitle("Forecast") +
  labs(x = "Month", y = "Number of People vaccinated") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  color_theme() +
  scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) +
  scale_x_date(breaks = "1 month", date_labels = "%b - %y") +
  geom_text(data = annotation,
            aes(x = x, y = y, label = label),
            color = "red",
            size = 4) +
  # Dashed vertical line at the boundary between observed and forecast.
  geom_vline(xintercept = as.Date("08-10-2021", "%d-%m-%Y"), linetype = 2)